Notebook following this tutorial: https://blogs.rstudio.com/tensorflow/posts/2018-06-25-sunspots-lstm/

knitr::opts_chunk$set(eval = T, comment = "", results = "hold", message = F, tidy = T, dpi = 360)

Setup

library(tidyverse)
library(glue)

library(timetk)
library(tidyquant)
library(tibbletime)

library(cowplot)

library(recipes)
library(rsample)
library(yardstick)

library(keras)
library(tfruns)

Get the data

sun_spots <- datasets::sunspot.month %>% tk_tbl() %>% mutate(index = as_date(index)) %>% 
    as_tbl_time(index = index)

sun_spots
# A time tibble: 3,177 x 2
# Index: index
   index      value
   <date>     <dbl>
 1 1749-01-01  58  
 2 1749-02-01  62.6
 3 1749-03-01  70  
 4 1749-04-01  55.7
 5 1749-05-01  85  
 6 1749-06-01  83.5
 7 1749-07-01  94.8
 8 1749-08-01  66.3
 9 1749-09-01  75.9
10 1749-10-01  75.5
# ... with 3,167 more rows

Plots

p1 <- sun_spots %>% ggplot(aes(index, value)) + geom_point(color = palette_light()[[1]], 
    alpha = 0.5) + theme_tq() + labs(title = "From 1749 to 2013 (full data)")

p2 <- sun_spots %>% filter_time("start" ~ "1800") %>% ggplot(aes(index, value)) + 
    geom_line(color = palette_light()[[1]], alpha = 0.5) + geom_point(color = palette_light()[[1]]) + 
    geom_smooth(method = "loess", span = 0.2, se = F) + theme_tq() + labs(title = "1749 to 1759 (Zoomed In To Show Changes over the Year)", 
    caption = "datasets::sunspot.month")

p_title <- ggdraw() + draw_label("Sunspots", size = 18, fontface = "bold", colour = palette_light()[[1]])  # Not color

plot_grid(p_title, p1, p2, ncol = 1, rel_heights = c(0.1, 1, 1))

Backtesting

periods_train <- 12 * 100
periods_test <- 12 * 50
skip_span <- 12 * 22

rolling_origin_resamples <- rolling_origin(sun_spots, initial = periods_train, 
    assess = periods_test, cumulative = F, skip = skip_span)

rolling_origin_resamples

Rolling origin forecast resampling

A tibble: 6 x 2

splits id
1 <S3: rsplit> Slice1 2 <S3: rsplit> Slice2 3 <S3: rsplit> Slice3 4 <S3: rsplit> Slice4 5 <S3: rsplit> Slice5 6 <S3: rsplit> Slice6

plot_split <- function(split, expand_y_axis = T, alpha = 1, size = 1, base_size = 14) {
    
    train_tbl <- training(split) %>% add_column(key = "training")
    
    test_tbl <- testing(split) %>% add_column(key = "testing")
    
    data_manipulated <- bind_rows(train_tbl, test_tbl) %>% as_tbl_time(index = index) %>% 
        mutate(key = fct_relevel(key, "training", "testing"))
    
    train_time_summary <- train_tbl %>% tk_index() %>% tk_get_timeseries_summary()
    
    test_time_summary <- test_tbl %>% tk_index() %>% tk_get_timeseries_summary()
    
    g <- data_manipulated %>% ggplot() + aes(x = index, y = value, color = key) + 
        geom_line(size = size, alpha = alpha) + theme_tq(base_size = base_size) + 
        scale_color_tq() + labs(title = glue("Split: {split$id}"), subtitle = glue("{train_time_summary$start} to ", 
        "{test_time_summary$end}"), y = "", x = "") + theme(legend.position = "none")
    
    if (expand_y_axis) {
        sun_spots_time_summary <- sun_spots %>% tk_index() %>% tk_get_timeseries_summary()
        
        g <- g + scale_x_date(limits = c(sun_spots_time_summary$start, sun_spots_time_summary$end))
        
    }
    
    g
    
}
rolling_origin_resamples$splits[[1]] %>% plot_split(expand_y_axis = T) + theme(legend.position = "bottom")

# Plotting function that scales to all splits
plot_sampling_plan <- function(sampling_tbl, expand_y_axis = TRUE, ncol = 3, 
    alpha = 1, size = 1, base_size = 14, title = "Sampling Plan") {
    
    # Map plot_split() to sampling_tbl
    sampling_tbl_with_plots <- sampling_tbl %>% mutate(gg_plots = map(splits, 
        plot_split, expand_y_axis = expand_y_axis, alpha = alpha, base_size = base_size))
    
    # Make plots with cowplot
    plot_list <- sampling_tbl_with_plots$gg_plots
    
    p_temp <- plot_list[[1]] + theme(legend.position = "bottom")
    legend <- get_legend(p_temp)
    
    p_body <- plot_grid(plotlist = plot_list, ncol = ncol)
    
    p_title <- ggdraw() + draw_label(title, size = 14, fontface = "bold", colour = palette_light()[[1]])
    
    g <- plot_grid(p_title, p_body, legend, ncol = 1, rel_heights = c(0.05, 
        1, 0.05))
    
    g
    
}
rolling_origin_resamples %>% plot_sampling_plan(expand_y_axis = T, ncol = 3, 
    alpha = 1, size = 1, base_size = 10, title = "Backtesting Strategy: Rolling Origin Sampling Plan")

rolling_origin_resamples %>% plot_sampling_plan(expand_y_axis = F, ncol = 3, 
    alpha = 1, size = 1, base_size = 10, title = "Backtesting Strategy: Zoomed In")

LSTM

Take one split only

example_split <- rolling_origin_resamples$splits[[6]]
example_split_id <- rolling_origin_resamples$id[[6]]
plot_split(example_split, expand_y_axis = F, size = 0.5) + theme(legend.position = "bottom") + 
    ggtitle(glue("Split: {example_split_id}"))

df_trn <- analysis(example_split)[1:800, , drop = F]
df_val <- analysis(example_split)[801:1200, , drop = F]
df_tst <- assessment(example_split)
df <- bind_rows(df_trn %>% add_column(key = "training"), df_val %>% add_column(key = "validation"), 
    df_tst %>% add_column(key = "testing")) %>% as_tbl_time(index = index)

df
# A time tibble: 1,800 x 3
# Index: index
   index      value key     
   <date>     <dbl> <chr>   
 1 1859-06-01  87.1 training
 2 1859-07-01  95.2 training
 3 1859-08-01 107.  training
 4 1859-09-01 106.  training
 5 1859-10-01 115.  training
 6 1859-11-01  97.2 training
 7 1859-12-01  81   training
 8 1860-01-01  82.4 training
 9 1860-02-01  88.3 training
10 1860-03-01  98.9 training
# ... with 1,790 more rows
rec_obj <- recipe(value ~ ., df) %>% step_sqrt(value) %>% step_center(value) %>% 
    step_scale(value) %>% prep()